In [1]:
# Auto-format every executed cell with black.
%load_ext nb_black
In [2]:
import pandas as pd
from datetime import timedelta, datetime
import plotly.express as px
from sklearn.linear_model import LinearRegression
import numpy as np
from plotly import graph_objects as go
In [3]:
# Load the OWID COVID dataset and restrict it to Russia.
data = pd.read_csv("owid-covid-data.csv")
data = data[data["location"] == "Russia"]
# Replace zero daily counts with 1 so log-transforms downstream are defined.
# .loc avoids the chained assignment of the original, which raises
# SettingWithCopyWarning and may silently fail to write.
data.loc[data.new_cases == 0, "new_cases"] = 1
data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
data = data.set_index("date")

start_date = datetime(year=2020, month=3, day=3)
stop_train_date = start_date + timedelta(days=50)
data = data[data.index >= start_date]

# Split off the target columns (cumulative and daily case counts).
target = data[["total_cases", "new_cases"]]
data = data.drop(columns=["total_cases", "new_cases"])
display(data.head())
display(target.head())
iso_code continent location new_cases_smoothed total_deaths new_deaths new_deaths_smoothed total_cases_per_million new_cases_per_million new_cases_smoothed_per_million ... male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index population excess_mortality_cumulative_absolute excess_mortality_cumulative excess_mortality excess_mortality_cumulative_per_million
date
2020-03-03 RUS Europe Russia 0.143 NaN 0.0 0.0 0.021 0.007 0.001 ... 58.3 NaN 8.05 72.58 0.824 144713312.0 NaN NaN NaN NaN
2020-03-04 RUS Europe Russia 0.143 NaN 0.0 0.0 0.021 0.000 0.001 ... 58.3 NaN 8.05 72.58 0.824 144713312.0 NaN NaN NaN NaN
2020-03-05 RUS Europe Russia 0.143 NaN 0.0 0.0 0.021 0.000 0.001 ... 58.3 NaN 8.05 72.58 0.824 144713312.0 NaN NaN NaN NaN
2020-03-06 RUS Europe Russia 0.286 NaN 0.0 0.0 0.028 0.007 0.002 ... 58.3 NaN 8.05 72.58 0.824 144713312.0 NaN NaN NaN NaN
2020-03-07 RUS Europe Russia 0.286 NaN 0.0 0.0 0.028 0.000 0.002 ... 58.3 NaN 8.05 72.58 0.824 144713312.0 NaN NaN NaN NaN

5 rows × 64 columns

total_cases new_cases
date
2020-03-03 3.0 1.0
2020-03-04 3.0 1.0
2020-03-05 3.0 1.0
2020-03-06 4.0 1.0
2020-03-07 4.0 1.0
In [4]:
# Training-period case curves (cumulative and daily), interactive line plot.
px.line(target[target.index < stop_train_date], markers=True)
In [5]:
# Sanity check: the last cumulative total should match the sum of daily cases
# (small discrepancies indicate minor inconsistencies in the raw data).
train_target = target[target.index < stop_train_date]
(
    # .iloc[-1]: positional indexing; Series[-1] label-fallback is deprecated in pandas
    f"last total: {train_target.total_cases.iloc[-1]}",
    f"sum new: {train_target.new_cases.sum()}",
)
Out[5]:
('last total: 52763.0', 'sum new: 52772.0')

Очевидно, что new_cases можно пересчитать в total_cases (есть незначительные ошибки в данных). Но new_cases может убывать, а т.к. мы априори берём неубывающие функции, логично пользоваться именно total_cases.

In [6]:
# Work on an explicit copy so the column assignment below does not trigger
# SettingWithCopyWarning (the original chained a new column onto a slice).
df = target[["new_cases", "total_cases"]].copy()
df["x"] = np.arange(df.shape[0])  # integer time index used as regression feature
assert all(
    timedelta(days=1) == d for d in df.index[1:] - df.index[:-1]
)  # verify the series has an observation for every consecutive day
/tmp/ipykernel_140/346176215.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [7]:
# Train on the first 50 days, test up to 2020-09-01.
# Use df.index consistently (the original mixed target.index and df.index —
# they are equal here, but mixing them is fragile) and reuse stop_train_date
# instead of recomputing start_date + 50 days.
train = df[df.index < stop_train_date]
test = df[
    (df.index >= stop_train_date)
    & (df.index <= datetime(year=2020, month=9, day=1))
]
In [8]:
# Cumulative cases over the test horizon.
px.line(test.total_cases, markers=True)

Первая модель имеет вид $$ y=c_1e^{c_2x+c_3} $$, тогда $$ \ln(y)=\ln(c_1)+c_2x+c_3 $$ $$\ln(y)=ax+b$$ Поэтому нужно прологарифмировать данные

In [9]:
# Fit a line to log(total_cases): exponential-growth model ln(y) = a*x + b.
x_train = train["x"].to_numpy()[:, None]
y_train = train["total_cases"].to_numpy().ravel()
model = LinearRegression().fit(x_train, np.log(y_train))
f"ln(y)={model.intercept_:.3}+{model.coef_[0]:.2}x"
Out[9]:
'ln(y)=1.19+0.21x'
In [10]:
# Compare the fitted exponential curve against actual totals on the training window.
pred_train = np.exp(model.predict(x_train))
fig = go.Figure()
fig.add_scatter(y=pred_train, name="predict", x=train.index)
fig.add_scatter(y=y_train, name="real", x=train.index)
In [11]:
# Residual std in log-space: used as the observation-noise sigma in the Bayesian update.
sigma_noise = np.std(np.log(y_train) - np.log(pred_train))
sigma_noise
Out[11]:
0.4498899109116323
In [12]:
def bayesian_update(mu, sigma, x, y, sigma_noise):
    """Single-observation Bayesian linear-regression update.

    Given a Gaussian prior N(mu, sigma) over the weight vector (bias weight
    first) and one observation (x, y) with noise std sigma_noise, return the
    posterior mean and covariance (mu_n, sigma_n).
    """
    design = np.hstack(([1], x))[None, :]  # 1 x (d+1) design row with bias term
    prior_precision = np.linalg.inv(sigma)
    noise_precision = 1 / (sigma_noise**2)
    sigma_n = np.linalg.inv(prior_precision + noise_precision * (design.T @ design))
    mu_n = sigma_n @ (
        prior_precision @ np.transpose(mu)
        + noise_precision * (design.T @ np.array([y]))
    )
    return mu_n, sigma_n
In [13]:
def bayesian_update_loop(x, y, sigma_noise):
    """Sequentially fold every (row, log(target)) observation into the posterior.

    Starts from a zero-mean prior with covariance 2*I over the weights
    (bias plus one weight per feature column of x) and applies bayesian_update
    once per observation. Returns the final posterior mean and covariance.
    """
    n_weights = x.shape[1] + 1  # bias term + one weight per feature
    mu, sigma = np.zeros(n_weights), 2 * np.eye(n_weights)

    for features, target_value in zip(x, y):
        mu, sigma = bayesian_update(mu, sigma, features, np.log(target_value), sigma_noise)

    return mu, sigma
In [14]:
# Posterior over (b, a) for ln(y) = a*x + b, fitted on the training window.
mu, sigma = bayesian_update_loop(x=x_train, y=y_train, sigma_noise=sigma_noise)
mu, sigma
Out[14]:
(array([1.18416525, 0.21438625]),
 array([[ 1.55931945e-02, -4.72519863e-04],
        [-4.72519863e-04,  1.93255611e-05]]))
In [15]:
# Number of weight vectors drawn from the posterior for the fan charts below.
samples_count = 50
In [16]:
def draw_samples(fig, data, mu, sigma, count=samples_count):
    """Draw `count` weight samples from the posterior N(mu, sigma) and add the
    corresponding exponential curves exp(a*x + b) to `fig`.

    Side effects: stores every curve in `data` as a `sample_<i>` column and
    silences pandas chained-assignment warnings globally.
    """
    pd.options.mode.chained_assignment = None
    np.random.seed(0)  # fixed seed so the fan chart is reproducible
    xs = data.x.to_numpy().flatten()
    for idx, (intercept, slope) in enumerate(
        np.random.multivariate_normal(mu, sigma, count)
    ):
        column = f"sample_{idx}"
        data.loc[:, column] = np.exp(slope * xs + intercept)
        fig.add_scatter(
            y=data[column],
            showlegend=False,
            line_color="grey",
            opacity=0.1,
            x=data.index,
        )


# Fan chart on the training window: posterior samples vs fit vs reality.
fig = go.Figure()
draw_samples(fig, train, mu, sigma)
fig.add_scatter(y=pred_train, name="predict", line_color="green", x=train.index)
fig.add_scatter(y=y_train, name="real", x=train.index)
fig.update_layout(title="Данные на обучении")
In [17]:
y_test = test.total_cases.to_numpy().flatten()

fig = go.Figure()
draw_samples(fig, test, mu, sigma)
fig.add_scatter(y=y_test, name="real", x=test.index)
fig.update_layout(title="Данные на тесте")
fig.update_layout(yaxis_range=[0, 1e6])
In [18]:
# Posterior-predictive distribution of total cases on 2020-05-01.
samples = test.loc["2020-5-1"].filter(regex=("sample_*"))
display(f"предсказание: {int(samples.mean())}")
px.histogram(samples)
'предсказание: 1047015'
In [19]:
# Posterior-predictive distribution on 2020-06-01 — the exponential model
# already overshoots badly this far out.
samples = test.loc["2020-6-1"].filter(regex=("sample_*"))
display(f"предсказание: {int(samples.mean())}")
px.histogram(samples)
'предсказание: 838242656'
In [20]:
# Posterior-predictive distribution on 2020-09-01 — the exponential blow-up
# produces absurd totals, motivating the quadratic model below.
samples = test.loc["2020-9-1"].filter(regex=("sample_*"))
display(f"предсказание: {int(samples.mean())}")
px.histogram(samples)
'предсказание: 388038639395832128'

Во второй модели необходимо учесть квадратичную зависимость. Поэтому необходимо добавить квадратичный признак. Также наличие интеграла (суммирования) наталкивает на мысль использовать данные new_cases для обучения модели, чтобы из них получать total_cases.

In [21]:
# Quadratic model for the daily counts: ln(new_cases) = b0 + b1*x + b2*x^2.
x_lin = train.x.to_numpy().reshape(-1, 1)
x_train = np.hstack((x_lin, x_lin**2))  # feature columns: [x, x^2]
y_train = train.new_cases.to_numpy().flatten()
model = LinearRegression().fit(x_train, np.log(y_train))
f"ln(y)={model.intercept_:.3}+{model.coef_[0]:.2}x+{model.coef_[1]:.2}x^2"
Out[21]:
'ln(y)=-0.873+0.26x+-0.0012x^2'
In [22]:
# Compare the fitted quadratic-log curve against actual daily counts.
pred_train = np.exp(model.predict(x_train))
fig = go.Figure()
fig.add_scatter(y=pred_train, name="predict", x=train.index)
fig.add_scatter(y=y_train, name="real", x=train.index)
In [23]:
# Log-space residual std for the quadratic model.
sigma_noise = np.std(np.log(y_train) - np.log(pred_train))
sigma_noise  # sigma came out larger because the target here is the noisier new_cases
Out[23]:
1.1962232272133029
In [24]:
# Posterior over (b0, b1, b2) for the quadratic log-model.
mu, sigma = bayesian_update_loop(x=x_train, y=y_train, sigma_noise=sigma_noise)
mu, sigma
Out[24]:
(array([-0.77760687,  0.2505384 , -0.00105931]),
 array([[ 2.12555789e-01, -1.71684387e-02,  2.88989973e-04],
        [-1.71684387e-02,  1.95321068e-03, -3.76494139e-05],
        [ 2.88989973e-04, -3.76494139e-05,  7.78336878e-07]]))
In [25]:
def draw_samples(fig, data, mu, sigma, count=samples_count):
    """Quadratic-model variant: sample (a, b, c) from the posterior, build
    daily-count curves exp(a + b*x + c*x^2) and accumulate them with cumsum
    into total-case trajectories added to `fig`.

    Side effects: stores each trajectory in `data` as a `sample_<i>` column.
    NOTE(review): this shadows the earlier 2-parameter draw_samples; distinct
    names would make the notebook safer to re-run out of order.
    """
    pd.options.mode.chained_assignment = None
    np.random.seed(0)  # fixed seed so the fan chart is reproducible
    xs = data.x.to_numpy().flatten()
    for idx, (a, b, c) in enumerate(np.random.multivariate_normal(mu, sigma, count)):
        column = f"sample_{idx}"
        # cumulative sum turns predicted daily counts into total cases
        data.loc[:, column] = np.cumsum(np.exp(a + b * xs + c * xs**2))
        fig.add_scatter(
            y=data[column],
            showlegend=False,
            line_color="grey",
            opacity=0.1,
            x=data.index,
        )
In [26]:
# Training fit: cumulated daily predictions vs actual totals.
fig = go.Figure()
draw_samples(fig, train, mu, sigma)
fig.add_scatter(
    y=np.cumsum(pred_train), name="predict", line_color="green", x=train.index
)
fig.add_scatter(y=train.total_cases.to_numpy().flatten(), name="real", x=train.index)
fig.update_layout(title="Данные на обучении")
In [27]:
# Test-horizon fan chart with a 10/90-percentile band and the mean trajectory.
y_test = test.total_cases.to_numpy().flatten()

fig = go.Figure()
draw_samples(fig, test, mu, sigma)
fig.add_scatter(y=y_test, name="real", x=test.index)

# Filter the sample columns once instead of three times.
sample_curves = test.filter(regex="sample_*")
fig.add_scatter(
    y=np.percentile(sample_curves, 10, axis=1),
    name="10 percentile",
    x=test.index,
)
fig.add_scatter(
    y=np.percentile(sample_curves, 90, axis=1),
    name="90 percentile",
    x=test.index,
)
fig.add_scatter(
    y=np.mean(sample_curves, axis=1),
    name="mean",
    x=test.index,
)
fig.update_layout(title="Данные на тесте")
fig.update_layout(yaxis_range=[0, 1e6])

Естественно получилось лучше, чем экспоненциальный рост

In [28]:
def process(country_name, target, train_days=50, ylim=[0, 2e6]):
    """Fit the quadratic log-model on the first `train_days` days of `target`
    (a frame with total_cases / new_cases columns, daily DatetimeIndex) and
    plot posterior sample trajectories against reality up to 2020-10-01.

    Returns the point-estimate coefficients (intercept, linear, quadratic).

    NOTE(review): the default ylim is a mutable list; it is never mutated
    here, but a tuple would be safer.
    """
    target = target.copy()
    target["x"] = np.arange(target.shape[0])
    train = target[target.index < target.index[0] + timedelta(days=train_days)]
    target = target[target.index <= datetime(year=2020, month=10, day=1)]

    # Quadratic features [x, x^2] for ln(new_cases).
    x_train = np.hstack(
        (train.x.to_numpy().reshape(-1, 1), train.x.to_numpy().reshape(-1, 1) ** 2)
    )
    y_train = train.new_cases.to_numpy().flatten()
    model = LinearRegression().fit(x_train, np.log(y_train))
    display(f"ln(y)={model.intercept_:.3}+{model.coef_[0]:.2}x+{model.coef_[1]:.2}x^2")

    pred_train = np.exp(model.predict(x_train))
    sigma_noise = np.std(np.log(y_train) - np.log(pred_train))

    mu, sigma = bayesian_update_loop(x=x_train, y=y_train, sigma_noise=sigma_noise)

    y = target.total_cases.to_numpy().flatten()

    fig = go.Figure()
    draw_samples(fig, target, mu, sigma)
    fig.add_scatter(y=y, name="real", x=target.index)

    # Filter the sample columns once for both percentiles and the mean.
    sample_curves = target.filter(regex="sample_*")
    fig.add_scatter(
        y=np.percentile(sample_curves, 10, axis=1),
        name="10 percentile",
        x=target.index,
    )
    fig.add_scatter(
        y=np.percentile(sample_curves, 90, axis=1),
        name="90 percentile",
        x=target.index,
    )
    fig.add_scatter(
        y=np.mean(sample_curves, axis=1),
        name="mean",
        x=target.index,
    )
    fig.update_layout(title=f"Данные по {country_name}")
    # BUG FIX: the original read the global `y_lim` instead of the `ylim`
    # parameter (it only worked because the calling loop leaked that global).
    fig.update_layout(yaxis_range=ylim)
    display(fig)
    return model.intercept_, model.coef_[0], model.coef_[1]
In [29]:
# Reload the full multi-country dataset (the earlier `data` was Russia-only).
data = pd.read_csv("owid-covid-data.csv")
# .loc avoids chained assignment; zeros -> 1 keeps log() well-defined downstream.
data.loc[data.new_cases == 0, "new_cases"] = 1
data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")
data = data.set_index("date")
In [30]:
# Fit and plot each country, starting from the day its total passed 2 cases.
points = []
for country_name, y_lim in zip(
    ("Russia", "Germany", "Italy", "China"), ([0, 2e6], [0, 2e6], [0, 2e6], [0, 2e5])
):
    target = data[["total_cases", "new_cases"]]
    target = target[data["location"] == country_name]
    target = target[target.index >= target[target.total_cases > 2].index[0]]
    # Pass the axis range explicitly: the original relied on process() reading
    # the leaked global `y_lim` instead of its own ylim parameter.
    w0, w1, w2 = process(country_name, target, ylim=y_lim)
    points.append({"name": country_name, "w0": w0, "w1": w1, "w2": w2})
'ln(y)=-0.873+0.26x+-0.0012x^2'
'ln(y)=0.77+-0.14x+0.0062x^2'
'ln(y)=-1.17+0.14x+0.0018x^2'
'ln(y)=-2.01+0.56x+-0.0079x^2'
In [31]:
# Fitted model coefficients per country.
pd.DataFrame(points)
Out[31]:
name w0 w1 w2
0 Russia -0.872558 0.258281 -0.001190
1 Germany 0.770067 -0.137111 0.006168
2 Italy -1.165233 0.135692 0.001794
3 China -2.014519 0.561703 -0.007927
In [32]:
# Countries positioned in coefficient space (w0, w1, w2).
px.scatter_3d(pd.DataFrame(points), x="w0", y="w1", z="w2", color="name")

Сильно отличаются Китай и Германия. Россия подобна Италии